In [879]:
import os
import Bio
import re
import timeit
import copy
from Bio import SeqIO
from Bio.Blast import NCBIXML
from Bio import Restriction
from Bio.Restriction import *
from Bio.Alphabet.IUPAC import IUPACAmbiguousDNA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio import SeqFeature
from Bio.SeqFeature import *
import pandas
In [880]:
# Load every ABI sequencing trace in abifiles/5prlib into a list of SeqRecords.
# Fix: the original opened each file but never closed the handle (leaked one
# descriptor per trace); 'with' guarantees closure even if SeqIO.read raises.
lib5pr = []
for filename in os.listdir("abifiles/5prlib"):
    with open(os.path.join("abifiles/5prlib", filename), 'rb') as handle:
        record = SeqIO.read(handle, "abi", alphabet=IUPACAmbiguousDNA())
    lib5pr.append(record)
In [881]:
#sgRNAconst = SeqRecord(Seq("gttt"))
# Anchor motif: the first 9 nt of the invariant sgRNA hairpin/scaffold.
sgRNAconst = SeqRecord(Seq("GTTTAAGAG"))
# An alternative approach in which, rather than the T7 promoter being used, the first 9 nt of the sgRNA hairpin is used
# to find 20/21mers. This gets 45 targets...
sgRNAfiltlib5pr = copy.deepcopy(lib5pr)
# Annotate every occurrence of the hairpin motif on both strands as a SeqFeature.
for seqrecord in sgRNAfiltlib5pr:
    fwdlocs = []
    revlocs = []
    # re.finditer reports non-overlapping match start positions of the 9-mer.
    fwdlocs = [tloc.start() for tloc in re.finditer(str(sgRNAconst.seq), str(seqrecord.seq))]
    #print fwdlocs
    for item in fwdlocs:
        start = ExactPosition(int(item))
        # NOTE(review): end is start + len + 1, one base past the 9-mer — presumably
        # deliberate padding, but confirm the off-by-one is intended.
        end = ExactPosition(int((item) + len(sgRNAconst) + 1))
        location = FeatureLocation(start, end)
        # SeqFeature here is the class brought in by `from Bio.SeqFeature import *`
        # (it shadows the `from Bio import SeqFeature` module import above).
        feature = SeqFeature(location,type="sgRNAconst", strand = +1)
        seqrecord.features.append(feature)
    #print fwdlocs
    revlocs = [tloc.start() for tloc in re.finditer(str(sgRNAconst.reverse_complement().seq), str(seqrecord.seq))]
    for item in revlocs:
        # NOTE(review): reverse-strand features are shifted left by 1 relative to the
        # match start — asymmetric with the forward branch; confirm intended.
        start = ExactPosition(int(item) - 1)
        end = ExactPosition(start + len(sgRNAconst))
        location = FeatureLocation(start, end)
        feature = SeqFeature(location,type="sgRNAconst", strand = -1)
        seqrecord.features.append(feature)
#pick out 21mers before first 9nt of sgRNA hairpin
alltgts = []
for seqrecord in sgRNAfiltlib5pr:
    for feat in seqrecord.features:
        if feat.strand == 1:
            # Forward hit: the 21 nt immediately 5' of the hairpin are the candidate target.
            tgtstart = int(feat.location.start) - 21
            tgtend = int(feat.location.start)
            sgtgt = seqrecord[tgtstart:tgtend]
            alltgts.append(sgtgt)
            #print "pos \n \n"
        if feat.strand == -1:
            # Reverse hit: take the 21 nt 3' of the feature and reverse-complement
            # so all targets are reported in guide orientation.
            tgtend = int(feat.location.end) + 21
            tgtstart = int(feat.location.end)
            sgtgt = seqrecord[tgtstart:tgtend].reverse_complement()
            sgtgt.name=seqrecord.name
            alltgts.append(sgtgt)
In [883]:
# search against amps300; get hits:
# Build the substrate list: amps144 plus the extra amplicons, in that order.
#for item in SeqIO.parse("amps144masked_iter0.fasta", "fasta"):
#    allamps.append(item)
allamps = []
allamps.extend(SeqIO.parse("amps144.fasta", "fasta"))
allamps.extend(SeqIO.parse("theextraamps.fasta", "fasta"))
#for record in allamps:
#    record.seq = "GGGGGGGGGGGGGGGGGGGGGGGGGG"
In [884]:
len(allamps[0])
Out[884]:
Next: re-generate the list of cut guides that was used for BLAST earlier. Are the hits (within amps300) actually within the expected PAM-adjacent cut sites?
In [32]:
# This part cuts the amps300 fragments and makes the theoretical guide list
supercutslist = []
for individual_sequence in allamps:
    cutslist = []
    amp = individual_sequence
    substrate_name = amp.id
    substrate = amp.seq
    # Restriction .search returns the enzyme's top-strand cut positions.
    pos = HpaII.search(substrate)
    # Positions in this list correspond to the right boundaries of fragments;
    # last one is thus the sequence end
    pos.append(len(substrate))
    pos = iter(pos)
    # .catalyze returns the full-digest fragments; iterating it in step with the
    # positions iterator pairs each fragment with its right-boundary coordinate.
    cuts = HpaII.catalyze(substrate)
    for item in cuts:
        # [fragment seq, enzyme name, right boundary]; Python 2 iterator .next()
        cutslist.append([item, "HpaII", int(pos.next())])
    cuts = BfaI.catalyze(substrate)
    pos = BfaI.search(substrate)
    pos.append(len(substrate))
    pos = iter(pos)
    for item in cuts:
        cutslist.append([item, "BfaI", int(pos.next())])
    cuts = ScrFI.catalyze(substrate)
    pos = ScrFI.search(substrate)
    pos.append(len(substrate))
    pos = iter(pos)
    for item in cuts:
        cutslist.append([item, "ScrFI", int(pos.next())])
    #The above is all to get the results of a catalyze operation (i.e. tuples) into
    # a list format. Next part makes them into SeqRecords.
    i = 0
    cutslistrecords = []
    for item in cutslist:
        # SeqRecord fields are overloaded: description = enzyme, name = right-boundary
        # coordinate (as a string), dbxrefs = source amplicon id.
        cutslistrecords.append(SeqRecord(item[0], id = str(i), description = str(item[1]), name=str(item[2]), dbxrefs=[str(substrate_name)]))
        i = i+1
    cutslist = []
    cutslist = cutslistrecords
    # This part takes the 3' 20nt of each fragment and makes a new sequence with it.
    # For the 5' end, the Mung-Bean treatment is simulated by removing two more nt (for HpaII and BfaI), or one nt for ScrFI;
    # these would be the 5' overhang. Then we take the reverse-complement of the sequence.
    # The Restriction module just returns sequences as if the top strand only was being cut. In other words,
    # no bases are deleted from consecutive fragments.
    # Eventually, this might be better re-implemented as SeqFeatures on the original sequence..
    # NOTE(review): import inside the loop; harmless but belongs in the imports cell.
    from Bio.Seq import MutableSeq
    twentymers = []
    record2 = []
    for record2 in cutslist:
        try: # This is because a second run of this code on already mutable seqs seems to fail. Not sure how to flush out and revert back to non-mutables...
            record2.seq = record2.seq.tomutable()
        except:
            pass
        if record2.description == "ScrFI":
            #offset here (e.g. 1:21 is for simulating MBN digestion)
            # because the entry.names are rooted on the right of each fragment, the length
            # of the entry.name has to be subtracted to get the desired left position for the "reverse"
            # tgts
            entry = record2[1:21].reverse_complement(description=True, id=True, name=True)
            entry.name = int(record2.name)+1 - len(record2.seq)
            twentymers.append(entry)
        else: # Should work for HpaII/BfaI
            entry = record2[2:22].reverse_complement(description=True, id=True, name=True)
            entry.name = int(record2.name)+2 - len(record2.seq)
            twentymers.append(entry)
        record2.id = str("%s_fwd" % record2.id)
        # Forward target: the 3'-most 20 nt of the fragment.
        entry = record2[-20:]
        entry.name = int(record2.name)-20
        twentymers.append(entry)
    for item in twentymers:
        item.dbxrefs = [substrate_name]
    # The ends of the fragments aren't bonafide CRISPR targets; these can be removed:
    noends = []
    twentymerstr = [item for item in twentymers if item.description == "HpaII"]
    trimmed = twentymerstr[1:-1] # removes first and last 20mer
    noends.append(trimmed)
    twentymerstr = [item for item in twentymers if item.description == "BfaI"]
    trimmed = twentymerstr[1:-1]
    noends.append(trimmed)
    twentymerstr = [item for item in twentymers if item.description == "ScrFI"]
    trimmed = twentymerstr[1:-1]
    noends.append(trimmed)
    cutslist = [item for sublist in noends for item in sublist]
    supercutslist.append(cutslist)
In [33]:
# Flatten the per-amplicon lists of predicted 20mers into one flat list.
flattened = []
for per_amp_cuts in supercutslist:
    flattened.extend(per_amp_cuts)
supercutslist = flattened[:]
In [420]:
len(supercutslist) # These are all the calculated 20mers from the input substrate sequence
Out[420]:
In [35]:
# hitsumm is the subset of sequenced 20mers that find a hit in amps144 (and the extra amps)
# NOTE(review): this cell (In[35]) reads `hitsumm`, which is (re)defined in a LATER cell —
# the notebook does not survive Restart & Run All in its current cell order.
yesitsok = [] # This will be a tuple of (sequenced 20mer, substrate 20mer) of matching items
notok = [] # This will be the sequenced 20mers that don't match a predicted cutsite. Probably the ends of DNA fragments.
for item in hitsumm:
    madeahit = 0
    for cut in supercutslist:
        # Compare only the interior 18 nt (positions 2-19) to tolerate end differences.
        if item[1].seq[2:20] in cut.seq:
            yesitsok.append((cut, item))
            madeahit = 1
        if item[1].reverse_complement().seq[2:20] in cut.seq:
            yesitsok.append((cut, item))
            madeahit = 1
    if madeahit == 0:
        notok.append(item)
In [ ]:
yesitsok[1][1][1]
In [ ]:
# For sequenced 20mers that matched no predicted cut site, search the raw amplicons
# directly on both strands to see whether they map anywhere at all.
nohit = []
hitsumm = []
for item in notok:
    tgt = item[1]
    hitout = []
    nohits = 0
    for amp in allamps:
        # Trim one base off each end of the target to tolerate end artifacts.
        hit = [tloc.start() for tloc in re.finditer(str(tgt.seq[1:-1]), str(amp.seq))]
        if len(hit) > 0:
            hitout = (hit, tgt, tgt.reverse_complement().seq, amp.id, "fwd", amp.seq)
            hitsumm.append(hitout)
    if len(hitout) == 0: #make a marker of nohits =1 if there are no hits
        nohits = 1
    # and search again for the reverse complement of the target
    for amp in allamps:
        hit = [tloc.start() for tloc in re.finditer(str(tgt.reverse_complement().seq[1:-1]), str(amp.seq))]
        if len(hit) > 0:
            hitout = (hit, tgt, tgt.reverse_complement().seq, amp.id, "rev", amp.seq)
            hitsumm.append(hitout)
    # NOTE(review): `hitout` is not reset between the fwd and rev passes, so this test
    # only fires when BOTH passes found nothing — verify that is the intent.
    if len(hitout) == 0: # if the rc search has no hits, and if the previous search also had no hits, add this target to a list
        if nohits == 1:
            nohit.append(tgt)
This is somewhat of a reboot: starting again from the 21mers after T7 in the sequencing data, BLAST them against a newly made Amps144 database. The goal is to get the match locations and visualize them.
In [885]:
# Write the candidate targets to FASTA and BLAST them (short-query task) against
# the locally built amps144 database; XML (outfmt=5) output for NCBIXML parsing.
Bio.SeqIO.write(alltgts, "alltgtstemp.fa", "fasta")
blastn_cline = NcbiblastnCommandline(query="alltgtstemp.fa", db="amps144", \
    task = "blastn-short",outfmt=5, out="alltgts.blast", max_target_seqs=100, num_threads = 7, evalue = 0.005)
# timeit with number=1 just runs the command once and reports the wall time.
timeit.timeit(blastn_cline, number =1)
Out[885]:
In [886]:
# Parse the BLAST XML; NCBIXML.parse yields one record per query sequence.
result_handle = open("alltgts.blast")
blast_records = NCBIXML.parse(result_handle) # use NCBIXML.parse(result_handle) for multiple queries here
blast_records_list = list(blast_records)
result_handle.close()
In [887]:
# Pair each query SeqRecord with its BLAST result by position (same input order).
blastsandrecords = [(record, blast_records_list[idx]) for idx, record in enumerate(alltgts)]
In [887]:
In [888]:
i = 0
for item in blastsandrecords:
print(item[0].name + " " + item[0].seq) # Print out the query seq and its title, basically
for alignment in [item[1]]:
for item in alignment.alignments:
print item.title # Print out each Amp of amps144+extra amps that a match is made on
i= i+1
for hit in item.hsps:
print("--" + str(hit)) # Within each hit amp, print out each specific hit sequence
print "\n"
print i
In [889]:
#blastsandrecords[0][1].alignments[0].hsps[0].sbjct_start
#blastsandrecords[0][1].alignments[0].hsps[0].sbjct_end
#blastsandrecords[3][1].alignments[0].title.split()[1] # splits the generated BLAST alignment title on \
# whitespace; extracts the second element which \
# corresponds to the position on the scaffold
In [890]:
# Collect the mappable hits: (target record, (amplicon id, sbjct_start, sbjct_end), match string).
mappable = []
for tgt, blast in blastsandrecords:
    for i, j in enumerate(blast.alignments):
        try:
            # Amplicon name = second whitespace token of the BLAST hit title.
            pcr = blast.alignments[i].title.split()[1]
            start = blast.alignments[i].hsps[0].sbjct_start
            end = blast.alignments[i].hsps[0].sbjct_end
            match = blast.alignments[i].hsps[0].match
            query = blast.alignments[i].hsps[0].query
            print blast.alignments[i].title
            print pcr
            print i
            # Pad with spaces so the query/match lines align under the full target.
            print(" " * (blast.alignments[i].hsps[0].query_start - 1) + query)
            print (" " * (blast.alignments[i].hsps[0].query_start - 1) + match)
            print tgt.seq + "\n"
            print
            mapstring = (pcr, start, end)
            mappable.append((tgt, mapstring, match))
        except:
            # NOTE(review): bare except silently drops malformed hits (e.g. titles
            # without a second token); consider catching IndexError explicitly.
            pcr, start, end = 0,0,0
In [891]:
len(mappable)
Out[891]:
In [892]:
# Index amplicon sequences by id for constant-time lookup in printloc().
ampsdict = {amp.id: amp.seq for amp in allamps}
Problem: not all BLAST hit titles (sequences) seem to map to amps300 dict entries...
In [893]:
def printloc(queryseq, ampsdict): #lib5pr is subjectseq; t7 is queryseq
    '''
    Draw an ASCII map of where a mapped 20mer lies within its amplicon.

    queryseq is one `mappable` entry: (target SeqRecord, (amp_id, sbjct_start,
    sbjct_end), match); ampsdict maps amplicon id -> Seq. The BLAST subject
    coordinates determine strand (start <= end means forward).

    Returns a 7-tuple (fullstring, context, subjectseq, fwdlocs, start, end, feature).
    NOTE(review): the last four elements are whatever the LAST loop iteration left
    behind — they are only meaningful when exactly one feature was added. Fragile.
    '''
    subjectseq = SeqRecord(ampsdict[queryseq[1][0]])
    #for seqrecord in subjectseq:
    locstart = queryseq[1][1]
    #print queryseq
    locend = queryseq[1][2]
    fwdlocs = []
    revlocs = []
    # Figure out which strand the BLAST hit is on
    if locstart <= locend:
        fwdlocs.append(locstart)
    if locstart > locend:
        revlocs.append(locend)
    for item in fwdlocs:
        start = ExactPosition(int(item))
        # end pads one base past the query length (mirrors the sgRNA annotation cell).
        end = ExactPosition(int((item) + len(queryseq[0].seq) + 1))
        location = FeatureLocation(start, end)
        feature = SeqFeature(location,type=str("cutsite_fwd"), strand = +1)
        subjectseq.features.append(feature)
    for item in revlocs:
        # NOTE(review): reverse hits are shifted left by 2 and shortened by 1 —
        # confirm these offsets against known hits before trusting the maps.
        start = ExactPosition(int(item) - 2)
        end = ExactPosition(start + len(queryseq[0].seq) -1)
        location = FeatureLocation(start, end)
        feature = SeqFeature(location,type=str("cutsite_rev"), strand = -1)
        subjectseq.features.append(feature)
    #print subjectseq.features
    # Ruler line: a '^' in every 10th column, '-' elsewhere.
    mask = list((("-" * 9) + "^" )* int(round(len(subjectseq.seq)/10.0)))
    for feature in subjectseq.features:
        featstart = int(feature.location.start)
        featend = int(feature.location.end)
        if feature.strand == 1:
            # Overwrite the ruler with '>' chevrons across the feature span.
            mask = mask[:featstart] + [">"] * int(featend-1 - featstart) + mask[featend-1:]
            #context = subjectseq[featstart+1:featend+4]
            # context: the feature plus 10 nt of flank on each side (used for PAM checks).
            context = subjectseq[featstart-10:featend+10]
        if feature.strand == -1:
            mask = mask[:featstart+1] + ["<"] * int(featend+1 - featstart) + mask[featend+1:]
            #context = subjectseq[featstart-2:featend+2]
            context = subjectseq[featstart-10:featend+10]
    mask = "".join(mask)
    # Add labels
    masklab = list(" " * (len(subjectseq.seq)))
    for feature in subjectseq.features:
        featstart = int(feature.location.start)
        featend = int(feature.location.end)
        featname = str(feature.type)
        masklab = masklab[:featstart] + list(str(featname)) + list(" " * (featend-1 - featstart - len(featname))) + masklab[featend-1:]
    masklab = "".join(masklab)
    #print subjectseq.name
    lines = int(round(len(subjectseq.seq) / 100))
    i = 0
    fullstring = []
    # Draw out the map, with three lines: subject seq, a marker/counter line with chevrons over features, then a
    # feature label
    while i <= lines:
        indexstart = i*100
        indexend = (i+1) * 100
        if indexend > len(subjectseq.seq):
            indexend = len(subjectseq.seq)
        outstring = list(str(indexstart+1) + " " + subjectseq.seq[indexstart:indexend] + " " + str(indexend) + "\n" + \
            str(indexstart + 1) + " " + mask[indexstart:indexend] + " " + str(indexend) + "\n" + \
            str(indexstart +1) + " " + masklab[indexstart:indexend] + " " + str(indexend) + "\n")
        i = i + 1
        fullstring.extend(outstring)
    fullstring = "".join(fullstring)
    return (fullstring, context, subjectseq, fwdlocs, start, end, feature)
In [894]:
#t = printloc(mappable[3], ampsdict[mappable[3][1][0]])
t = printloc(mappable[6], ampsdict)
In [895]:
t[1]
Out[895]:
In [896]:
len(mappable)
Out[896]:
Next up: trim the sequence files properly based on their being followed by the hairpin start, and include a map of the expected cut sites in each amplicon.
In [897]:
mappable[1]
Out[897]:
In [898]:
# Flag any mappable entries whose target read contains an ambiguous base call (N).
for number, item in enumerate(mappable):
    # NOTE(review): membership test on a SeqRecord — presumably delegated to the
    # underlying Seq in this Biopython version; confirm it scans the letters.
    if str("N") in item[0]:
        print number
        print item
In [899]:
mappable[7]
Out[899]:
In [900]:
t = printloc(mappable[10], ampsdict)
#t = printloc(mappable[10], ampsdict["9981706"])
In [901]:
print t[1].seq
In [902]:
mappable[-1]
Out[902]:
In [906]:
primers_fwd = '''\
Fwd
TCCCTTTCTTTCCCGTTACC
AGAATAGGGACCGCATTGAC
GAGAGTCGGCGAGTCCATAA
AGAGCTGAAGCACCACAGGT
GCCGGATTCACTCAGATCA
ACCAGCCAACCAGACCATTA
GCCCGCTCCATTATAACAAG
TTCGTGCAATGGACAAGTAGA
GTTGACCCAAATGACCCATC
TGGTGAAGTGTGACGTAGCC
GGAATTGACATGGACAATGG
GGCTAAACCGGAATGAACAA
TGGGTCCCAATGATGAGTCT
CAGAGCTGCCTCATGACACT
GCAATGTGATGCGAAGTGTA
TAATCAGAGACTCGCCAACG
AAAGGATTGTGGACTCATGG
TGGACGCAGTTGTCATGATT
CGAAGAGGGATGCAGGACTA
AAGCGCCCTTCTTTCCTAAT
TAACAAGCCATTTGCCACCT
TTTCTGGAAGTTTGGACACTG
TGACTCAGAGGGTAGCAACT
ACCAAGAGTCCTCATGCACT
ACAATTCGCCAATCATTGCT
TCACACTTAACTGGGAGAATG
ATCATCCTGCGCCTAAGGTT
GCACTCTACACAAAGTTCTCG
CTGGGCCCAAAGTATCTCAT
TGTCACCCACTAATGTTTCAGG
CAAAGCTCACGTCAAATAAACG
GAACCAGAATGAGTGCTGTCC
AAAGACGGCCAGTATGCAGT
CAGTGTTCATCGGAACAAAGC
AGATCTTGGAGGCCCTGTTT
TGTGATTATGCAGAGGACAACC
TGCTCAATTACGGGTTTGGT
TTGGCCATACTTCAGCCAAT
CCGACCTGAACCCTCCTAAT
CTGTCTGTCTCTACCAATCACC
TGTGCTCTGTTGATGCGTCT
TCCTTCTCAGTATGCGCTGA
TGCAAGAGCGTCTGAATTTG
TATATTGCCTGGGCGCTAAC
TGTCACAACCCACTGATTCC
CCACTGATATAGTGTGGGCTAA
GTTACTGCCGTGAGGGATGA
AGTGATGGGTCTGCCAGAAT
AAACATGGTAAGCATCTGTGG
CAGTTATGGCTGCCTCGAA
GGGATTAGGGAGGATCAGGA
GCCAGGAATTGGCAGTAGTC
GACACGGGAAAGAAACATGA
TTTCAGTAGCCGCATCAGTG
CCAATTAAGCAGATTGGAGTTC
CCTTGTAATCCTACTGTGCCTA
GGCTTGCTCTGAGAAGGCTAT
TGCTGGAGTCCACCTGATTA
GACTGAACCGTCATTCCGATA
CGCCCACCAACTGAACTTAG
AGTGTGACGTCAGAGGCAAG
TTGCATTATTATGCGCTACTGG
GTTGTGAAATCTATTGCCTCCA
AGACACAATCTAATGAGGGATG
TTGGATGAGGTTGAGGCTTA
AGACTCCTGAGAGCCCATTT
CGTGCGATTGTTTCAGGTTT
GGAACGGTGTGTATGTCCAA
CCAAACCTAGGTGGTTCTCG
GGAACACTCATTAGGGAGCA
TCTTTACAGCACCTGCTTCTGA
GAGCCGAATAAAGTGACAAA
TGTGAATCAATCTGTCTTACGC
TATGATTGAGGGCCTTGTGG
CCAGTTCCAGGTGTGCCTA
CAGTGCCCACAAGGAGTAGG
AGAATAGGTGGATTCACTGAGG
AGTTGGGCAGGCCTAACATT
CCAATGGGCAGGAACTTATG
TCATCAACAACTGGAGTCTGC
ACGATGCAGCAATTCCCTAC
GGTACTGCCATCACCCTTGT
GCAGTGTGAGCCCAACAGTA
AGCCTGGACCTCTCCTTGAT
CCCATAAGTGCCGACTTCA
CCAGAAAGTAGGAGCCGATG
TCCCGGCTCTAAAGTAGTCTTG
AAAGTCAAGGGCTGCCATC
GGGAGAGCCCTTGGAATAAA
AACGATGTACAACACCAGTTGC
GCCAAGGATGAAACCAAATC
AATGGATCAATACCCTGTCC
CCACATAGCTTCCCTGTTCTTT
TGGTCATACCACACCAATGAA
CCAAGCTAGGCTTGAACTGG
TAGCCGCTTCGCAGTTTAAT
CCACCCTTCAGACTGGCTAC
AGGAAGGACATGGAATTAACTG
AATGCCCTCAAGTAGCATGG
GTCTTGAGGAAGCAGCAACC
TTTGCCCGGTGATAGAATGT
TCATGAGTTGCATATGGATGG
GTTCATTGATGGGTGCCAGT
TGGTGAACCTGTATCAAATACG
TCAGGAATGCCTTACTTGAGA
CTTGCAGGAACTTATGAACACA
TGAATGGATCCACCACAGAA
ATCCCAAGGGAACACGTAAG
GCCCACAGATTGCATTCAC
CGGCCCTGTCTCACAGTAA
CCAGGGTATTCTAACCCTATGC
TGGAGAATCCCAAGGATGTT
AACGTGCAACCTTTGAGTCC
TCCTCCTAAAGAAACGACGTG
TCCAAGCACTCCAACCTTGT
TTTCTGATGGGCCTCTGG
TCCTCGTAAGAGGTGTTTCCA
CACCCAACTCTTATGGTGGAA
ACCCGCCTCAATACCAAAGT
TCAGAATGGCTATGGCTGTG
CTAGCGGTTTATGAGCGTCAC
CCAACTCACACTCCAATAATCA
TCCATTGTAGCCGTGCTGTA
GCATTGCAGTTCCAATCAGA
GGCTGGACAAATACCACTGC
TCGTCAGAAGTTGTCCAAGG
CCACTATGGCCAACAAGAGAG
CCTGTGGGAAGTTATGAGACG
CTATTTGACCCGCAGTTTCC
TGGTTGCTCACATCACTGAA
CTCTTTGCAGATGAGCGTGA
GGAAGCTACTGCCGTCGTTA
TCCTTTATTGTCCCGCCATA
CCACATGTGCCTTGGTAAGA
CAACAGCAATCACCCTTCAA
AGGAGGATTATTGCACCCATA
TGTGCATCACACACTCTGGA
CTGGGACCACAGGGATAAAG
GTCGCACACATAAACGCAGT
AACAGGATCGGAGAGCATTG
ACACTCATGATAGTGACCTGCT
GGGAACCGTAGAGTTTATTGTG
AGGTTCCACAAGGAGGGAGT
ACATTGGCCTTGATCCTGAG
'''
In [907]:
primers_rev = '''\
Rev
GCGCCTAAGTGTCTTTGCAT
CAAGTGCAGAGCACCTTGAC
TGCCAACGTTTGTCTCTGAC
ATCACATGTGTCTCCAGGAA
GGCAGTTGGGACGTATTTGT
GCCAGTACCTGCCAGTAACC
TTGCTGGCACATTACCACTC
TTGGCTCCTGGACTGTCTTC
GAAACAGCCGTGTCCAGAT
TTCTGCAACGAACTGTCTCTG
GTTTCGGACCCACAATGG
TGTCACTAAAGCCTAGCAGAAA
CCCAGGAGATGGTCATAATC
GCCCTGAGTATCGGCATACA
TTTGTCAGCTTGTGGACCTG
AGGAATCCTATGCTATTTCTCG
CCCATGGTCCTTACAGACTGA
GAGCGTAGCACCACTTACGG
AACTGGTATGAATGCGCAAC
CCTGCTATCTCATCTTCCTTCA
CGAAATCCGGAAATCTCTGT
TCGGTCAGAATCACATCTGC
GCACTGGGATCTCAGGTTTG
CATTCTTCACGCTTGTTCCA
GCTGAAAGATACCTGCCAACA
GGGCAGGCTCTCTTAGTCAA
GCTGCTTGAATAATTCGTCTGC
TGACCGGAAATGTTGGAAAC
CTTGCACAAGTTGCTTCACA
GCTGTACCCGTGTAGGCTTT
GGCAAAGGGCTCCAGATATAA
CCCAGGAATAGAAGTCACGTTT
CTAATTGAATGCGTTCATGC
TAGGATGCTGCCCTATGGTC
GTCACCGACCATTCATTTCA
TTTCATCCCTTGTCATCCAT
GGGCTCTGGTCAAATGAT
CTTCTGCTCATGGGTTTGGT
CCAGTCTAGTGGCCAGGATT
AATTGAAGAGGAGACCCTGCT
AGGGCATCTCCAATGGTGTA
CTGCTGTACATCCAGGCTGA
CCATTACGGATGTAGTTCAGCA
GCAGTCGAGGCTTTGAGTCT
GCAAACCTTCAGGAGCATGT
AAAGACCCAGCAGGAATTGA
GGGTGTCTTAGAGGGTAACAAA
CACTCCATACAAAGCGCTCA
AATGGAAATCGCCACTATACG
ACAGACCCGCCCTGATGAT
CATTGACATGACACATTTCTCG
AGCTCTGTCCCAGGGTATAGT
AAGCCGTAAAGTGGAAGCAG
TGAGCTAACATTCTCAAGTCCA
TAATTGGGCTGTCCTTCACC
CCAGCTCAAGTTCGAGGAAA
ACATTCGCCGTAAAGCAAAG
AAATCCATTGGGCCTGCT
AAGGGACCATCTGGGTATGT
TGACCTGTACAACACCTTGTGA
TGTGCTACTGCCATGTACCC
CCATCTTAGGCCAACTTCCA
ACCCATCCTGGCACACTGTA
ACCCAAGGGTCTCACACTTC
GGGATTGAGTCAGGTGGGTTA
ATCCAAGTCCTGCCTGAGGT
TTTCATGTGAGGTTGCCAAT
AACACTTGTGTATCGGCCATC
TGAGCAACTTATTGAGGCACA
AGGGTTAGACGACTGCCAAG
TGCATTTAGACGTTTGGTTG
TCGAACTATCATCCCGCAGT
TCACCTGATGCCTTGGTAAA
CCCTCCAAATGAAGTGACCT
CCACCCAGGATCTATTTAGAGG
TTCGGCATCGCTTATTTACG
CTTTACGGATTGGGCAAGAA
TGACCCACTCAGCATAATGAA
TTTCTGGCAAGCACTCAGAA
AAACAAGGACATGCCACACA
TTCAATCCAAACGATGCAGA
TGCACCAGTCTATTCGGTCA
CCTGCATGCCTAGGGTATATT
CCCTGTGGTTGTCTAGCGTA
CCTAAGGCGCAATAGTGTGG
CTATCCAGAACCTCCCAGCA
AGAATACCACTGCTTGCTGAGA
CCTCTGCTGGCTACAGTTTG
CCATAAACCTTGGACGCAAC
TTTGAGTTGCCTGAACGTGA
GGTCTTCTTGGCCTTCCTAAA
GGTAGATACCCGTGGAATGC
GGGAGGGTATCCACATGAGA
GGAAGTGTAAGCTAAGGCTCA
ATTTCACGGCAAGCCAATTA
TTTGTCGCGCATCACTTT
CATTCCCTAAGGCATTTGTTTC
CCCTTAGAGGACAACGGAGA
TCGAGCATGGTCTGCATTAG
ACCTCTGTTGGTCCCTATGC
GGATTACAGTGGCCATATCGTT
TTAAGGAGCTGATGATTCCAG
CTACATGCCTTGGGCTTAGG
CTCAGGGTTCCTGTGCTCTC
CCCTCTTAGGGTATACGGGTTA
AATTTGGGTCGTGCGTATGT
ACTCCACTGAGGCCCAGATA
TTGGAAGGGCCATGTATAGG
AAGGGACGGTTTAGGGTCAG
CACGTGAGCTTCGGATGTTA
TTGAATGCATAGCACCTTTG
TGGCCTTGATCCTTCAGTTC
ATTTCAAATGCCCAAACGAC
TAACACCATGGCCGAGATTT
AAGTTCTCCAGGCGAATCAG
AGGTGTTTACCGAAGGCAGA
CGAGCTGTTGGTCATTGCTA
TCCAACCATTCCAAAGTCAA
CATCAGCACAAGCAGTCGTT
TGGTCTGATGTGACGAAAGC
TCTATCCATGGAGTCATTTGG
CTGGATGGCCAACTTCTGTC
GGAGACAAGACCGTGACACA
AACCTTGGCCAGGTATTATG
CCACATTTGTAAACGGCTCA
CCCATATTTGCGACATGTGTT
CCCATCAGCATACCACCTTC
CCCAGATTCCTGCCCATT
GATTTCGGGTGCATTGTCTT
GGACAACAGCTATGGCTTGC
TGTGTGTTATGGCGATGTCC
TTGCCTATAATTGAGCCAGAGA
AAACCATAGATCCTGGTTCAG
TTCAGATACTTCATCCTCAACC
ATGTATATTCACGCCTGTGG
GCTGCACAGAGATTCGATGA
CATGTCCAGGCAGTCCAAT
GGCAGGGTCCATCTACAGTT
CCCTCTCTCGGCTCCTATCT
TGCTCTTCAAAGTGCCTCCT
GGCCATCTGAGACTTTGCAC
CTGCAGCAATGGCCTTAAAT
AACTTGAGCGCAGGGAACT
AGATCTGGCTCCCATGCAC
'''
In [908]:
# Parse the tab-separated primer blocks into one-column DataFrames.
# Fix: `io` was never imported anywhere in this notebook, so this cell raised
# NameError on a fresh kernel; import it locally. (Python 2: the primer strings
# are byte strings, so io.BytesIO is the right file-like wrapper for pandas.)
import io
primers_fwd = pandas.read_table(io.BytesIO(primers_fwd))
primers_rev = pandas.read_table(io.BytesIO(primers_rev))
In [909]:
# Pull the single primer column out of each one-column DataFrame.
# itertuples yields (index, primer) pairs; element 1 is the primer string.
fwdprimerlist = [row[1] for row in primers_fwd.itertuples()]
revprimerlist = [row[1] for row in primers_rev.itertuples()]
In [910]:
# Reverse-complement every reverse primer (as ambiguous-DNA Seq objects).
revprimersreversed = []
for item in revprimerlist:
    item = Seq(item, IUPACAmbiguousDNA()).reverse_complement()
    revprimersreversed.append(item)
# Fix: the original assigned to `revprimerslist` (extra 's'), a typo that created a
# dead variable and left `revprimerlist` unchanged until the deepcopy in the next
# cell quietly corrected it.
revprimerlist = revprimersreversed
In [911]:
revprimerlist = copy.deepcopy(revprimersreversed)
In [912]:
len(revprimersreversed)
Out[912]:
In [913]:
# Convert the forward primer strings into Seq objects (ambiguous-DNA alphabet).
f = [Seq(primer, IUPACAmbiguousDNA()) for primer in fwdprimerlist]
fwdprimerlist = copy.deepcopy(f)
In [914]:
?item.
In [915]:
fwdprimerlist
Out[915]:
In [916]:
ampsdict["10206893"]
Out[916]:
In [917]:
fwdprimerlist.extend(revprimerlist)
In [918]:
# Wrap each primer Seq in a SeqRecord whose id/name/description are its list index,
# so BLAST hits can be traced back to the primer by number.
f = []
for index, primer in enumerate(fwdprimerlist):
    record = SeqRecord(primer, id=str(index), name=str(index), description=str(index))
    f.append(record)
In [919]:
# Write the combined primer records to FASTA (source for the 288prim BLAST db).
with open("288primers.fasta", "w") as handle:
    SeqIO.write(f, handle, "fasta")
Made a BLAST database ("288prim") from the 288 primers.
In [920]:
# To match against the primer BLAST database, which of the mappable 20mers are basically fragment ends?
# First, write out a query fasta containing the mappable targets:
m = [j for j,k,l in copy.deepcopy(mappable)]  # keep only the target SeqRecord from each (tgt, loc, match) triple
In [921]:
# Use the numeric fragment-boundary `name` as the FASTA id so hits trace back.
for item in m:
    item.id = item.name
In [922]:
Bio.SeqIO.write(m, "mappablesequencedtgts.fasta", "fasta")
Out[922]:
In [923]:
# BLAST the mappable targets against the 288-primer database (short-query task,
# XML output for NCBIXML parsing); timeit(number=1) runs it once and times it.
blastn_cline = NcbiblastnCommandline(query="mappablesequencedtgts.fasta", db="288prim", \
    task = "blastn-short",outfmt=5, out="288prim.blast", max_target_seqs=100, num_threads = 7, evalue = 0.01)
timeit.timeit(blastn_cline, number =1)
Out[923]:
In [924]:
# Parse the primer-vs-target BLAST XML into a list (one record per query).
result_handle = open("288prim.blast")
prim_blast_records = NCBIXML.parse(result_handle) # use NCBIXML.parse(result_handle) for multiple queries here
prim_blast_records_list = list(prim_blast_records)
result_handle.close()
In [925]:
# Print every primer-database alignment and count the total number of HSPs.
counter = 0
for item in prim_blast_records_list:
    for i, h in enumerate(item.alignments):
        print i
        for j in h.hsps:
            print j
            counter = counter + 1
print counter
In [ ]:
In [940]:
# Generate a list of the mappable targets that didn't get matches against the primer library
mappable_noends = [record.query.split()[0]
                   for record in prim_blast_records_list
                   if len(record.alignments) == 0]
So, 25 of the 42 mappable matches are the ends of PCR products. Next, filter the mappable list to remove these. Then check PAM-adjacency.
In [941]:
mappable_noends
Out[941]:
In [950]:
len(mappable[0])
Out[950]:
In [1002]:
# Keep only the mappable hits whose target is NOT a primer / fragment end.
d = [(tgt, loc, match) for tgt, loc, match in mappable if tgt.name in mappable_noends]
In [1013]:
In [1049]:
# For each surviving target, pull the 3 nt adjacent to the mapped cut site as the
# candidate PAM. printloc returns (fullstring, context, subjectseq, fwdlocs,
# start, end, feature), so t[-3]/t[-2] are the start/end positions of the LAST
# feature processed — NOTE(review): only valid when exactly one feature was mapped.
pamlist = []
for item in d:
    t = printloc(item, ampsdict)
    if t[1].features[0].strand == 1:
        # Forward hit: 3 nt just 3' of the feature end.
        pamlist.append((t, t[2][t[-2]-1:t[-2]+2]))
    if t[1].features[0].strand == -1:
        # Reverse hit: 3 nt just 5' of the feature start, reverse-complemented
        # so the PAM reads in guide orientation.
        pamlist.append((t, t[2][t[-3]-2:t[-3]+1].reverse_complement()))
In [1047]:
print(t[0])
In [1037]:
print(t[1])
In [1064]:
for item, (j, otherjunk) in enumerate(pamlist):
print item
print otherjunk
print "\n"
In [1070]:
print pamlist[14][0][0]
In this analysis, 6 items are not adjacent to an NAG/NGG PAM, but 3 of those are probably leftover fragment-end duplicates, leaving 14/17 correct — an 82% yield.
1 is in a primer (989347).
6 (141439)
7 (not in primer...),
8 (not in primer),
9 (not in primer)
14. (is in 4995367)
In [ ]: